Run ACF and PACF analyses to select the right number of lags
import datetime
import os
import pandas as pd
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
sys.path.append("../src/")
from prepare_data import set_type, split_train_test, check_split
from feature_engineering import extract_date_features, add_lockdown_periods, add_holidays_period, add_lags_sma
%load_ext autoreload
%autoreload 2
# Location of the raw input files and the name of the date column,
# both reused by the cells below.
path_data = '../data/raw/'
feat_date = "Date"

# The raw export is a semicolon-separated file; pandas infers the
# compression from the ".gz" extension.
raw_file = os.path.join(path_data, "dataset.gz")
df = pd.read_csv(raw_file, sep=";")

# Peek at a handful of random rows.
df.sample(5)
| | Date | Fourni | Ventes | id |
|---|---|---|---|---|
| 67459 | 2022-08-11 | 8.0 | 8.0 | -8793000144743990337 |
| 48519 | 2021-10-27 | 7.0 | 4.0 | 2942031581314612067 |
| 44782 | 2021-09-03 | 10.0 | 6.0 | 6183672341764466493 |
| 78372 | 2023-01-30 | 10.0 | 5.0 | 1129067540243158717 |
| 71371 | 2022-10-14 | 7.0 | 3.0 | -1139387161625682197 |
# Cast the columns with the project helper before any analysis.
df = set_type(df, feat_date=feat_date)

# Keep only rows with a known "Ventes" value, then average "Ventes" per
# (date, id) pair and index the result by date for the time-series plots.
has_sales = df["Ventes"].notnull()
mean_sales = df[has_sales].groupby([feat_date, "id"])["Ventes"].mean()
df_analyze = mean_sales.reset_index().set_index(feat_date)
# Draw the ACF (left) and PACF (right) of "Ventes" for every id, one figure
# per id.
for series_id in df_analyze["id"].unique():
    # NOTE: `temp` stays a notebook-level name, as it was before.
    temp = df_analyze[df_analyze["id"] == series_id]
    fig, ax = plt.subplots(1, 2, figsize=(15, 3))
    try:
        sm.graphics.tsa.plot_acf(
            temp["Ventes"],
            lags=150,
            ax=ax[0],
            # f-string so the title also works when the id is not a str
            title=f"AUTOCORRELATION\n{series_id}",
        )
        sm.graphics.tsa.plot_pacf(
            temp["Ventes"],
            lags=150,
            ax=ax[1],
            title=f"PARTIAL AUTOCORRELATION\n{series_id}",
        )
    except ValueError as err:
        # statsmodels raises ValueError when the series is too short for the
        # requested number of lags; skip that id but say so instead of
        # silently swallowing every possible error.
        print(f"Skipping id {series_id}: {err}")
    else:
        plt.show()
    finally:
        # Release the figure right away: leaving one open per id is what
        # triggers matplotlib's "More than 20 figures have been opened"
        # warning and keeps them all in memory.
        plt.close(fig)
/var/folders/q1/n9jt1sqj3976rqf2zn88_qpc0000gn/T/ipykernel_87722/25242624.py:4: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. fig, ax = plt.subplots(1,2,figsize=(15,3))
Focus on the ACF and PACF of the top seller to get a clearer view
# Id singled out for a closer look at its autocorrelation structure.
top_id = "-8295888135039322961"

# Mean of "Ventes" per date for that id only (rows without "Ventes" dropped),
# indexed by date.
df_analyze_id = (
    df[(df["Ventes"].notnull()) & (df["id"] == top_id)]
    .groupby([feat_date])["Ventes"]
    .mean()
    .reset_index()
    .set_index(feat_date)
)

fig, ax = plt.subplots(figsize=(15, 5))
sm.graphics.tsa.plot_acf(
    df_analyze_id["Ventes"], lags=150, ax=ax, title="AUTOCORRELATION\n" + top_id
)
plt.show()

fig, ax = plt.subplots(figsize=(15, 5))
# Plot the selected id's own series: the leftover `temp` from the per-id loop
# holds whichever id was processed last, so using it here would draw another
# id's PACF under the top_id title.
sm.graphics.tsa.plot_pacf(
    df_analyze_id["Ventes"], lags=150, ax=ax, title="PARTIAL AUTOCORRELATION\n" + top_id
)
plt.show()
Looking at the PACF plot, we select the following lags to try in the model: 1, 2, 3, 4, 5, 6, 7, 10, 13, 15, 34
Since we assume the decision has to be made two weeks in advance, our lags should start at 14 days at the earliest.
# Candidate values read off the PACF plot: every value from 1 to 7, then the
# isolated ones (10, 13, 15, 34). Passed to add_lags_sma further down.
lags_to_try = [*range(1, 8), 10, 13, 15, 34]
For the split, we choose to leave one week, as we consider it might be the time needed to prepare everything (ordering resources for printing, ordering the right amount for delivery, etc.). Of course, this choice is arbitrary and might need to be changed depending on the customer.
# Reload the raw dataset so the features are built from a fresh copy
df = pd.read_csv(os.path.join(path_data, "dataset.gz"), sep=";")
# Set type
df = set_type(df, feat_date=feat_date)
# Add the lag/SMA features for the values selected from the PACF plot
df = add_lags_sma(df, lags_to_try, feat_id='id', feat_date=feat_date)
# Split into train and test (same date column everywhere: feat_date)
df_train, df_test, y_train, y_test = split_train_test(df=df, feat_date=feat_date)
check_split(df_train, df_test, feat_date)
# Save
path_save = '../data/processed'
# Create the output folder if needed so the writes below do not fail on a
# fresh checkout
os.makedirs(path_save, exist_ok=True)
df_train.to_pickle(os.path.join(path_save, "train.pkl"))
df_test.to_pickle(os.path.join(path_save, "test.pkl"))
y_train.to_csv(os.path.join(path_save, "target_train.csv"))
y_test.to_csv(os.path.join(path_save, "target_test.csv"))
Train: 2020-01-02 00:00:00 2023-04-02 00:00:00 Test: 2023-04-03 00:00:00 2023-04-17 00:00:00 Ind in both train and test: []
/Users/benjaminwallyn/Documents/Git/demand-forecast/notebooks/../src/prepare_data.py:68: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_train.drop(columns="Ventes", inplace=True) /Users/benjaminwallyn/Documents/Git/demand-forecast/notebooks/../src/prepare_data.py:69: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_test.drop(columns="Ventes", inplace=True)
# Display the frame returned by add_lags_sma to inspect the added columns.
df
| | Date | Fourni | Ventes | id | sma_1_ventes_lag14 | sma_2_ventes_lag14 | sma_3_ventes_lag14 | sma_4_ventes_lag14 | sma_5_ventes_lag14 | sma_6_ventes_lag14 | sma_7_ventes_lag14 | sma_10_ventes_lag14 | sma_13_ventes_lag14 | sma_15_ventes_lag14 | sma_34_ventes_lag14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-02 | 10.0 | 5.0 | -478139654568867546 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 72 | 2020-01-02 | 7.0 | 2.0 | -876129535061941331 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 71 | 2020-01-02 | 6.0 | 3.0 | 4324517785573311838 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 70 | 2020-01-02 | 2.0 | 2.0 | 5056562842583071429 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 69 | 2020-01-02 | 15.0 | 8.0 | 3525884501202641829 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 83316 | 2023-04-17 | 5.0 | 5.0 | 294028829836586752 | 1.0 | 1.0 | 1.000000 | 1.0 | 1.4 | 1.166667 | 1.000000 | 0.9 | 1.000000 | 1.066667 | 1.235294 |
| 83317 | 2023-04-17 | 21.0 | 21.0 | 6858783929733742556 | 21.0 | 25.5 | 33.333333 | 30.0 | 26.8 | 25.333333 | 24.000000 | 25.6 | 23.538462 | 22.533333 | 21.882353 |
| 83318 | 2023-04-17 | 2.0 | 2.0 | 2132453013379446212 | 1.0 | 1.0 | 1.333333 | 1.0 | 1.2 | 1.333333 | 1.428571 | 1.5 | 1.461538 | 1.466667 | 1.500000 |
| 83305 | 2023-04-17 | 10.0 | 10.0 | -8817594176277759455 | 4.0 | 6.5 | 6.333333 | 6.0 | 6.0 | 7.166667 | 7.285714 | 7.1 | 7.307692 | 7.133333 | 6.823529 |
| 83350 | 2023-04-17 | 10.0 | 10.0 | 1129067540243158717 | 0.0 | 0.0 | 0.000000 | 1.0 | 2.0 | 3.333333 | 3.428571 | 3.2 | 3.923077 | 4.266667 | 5.941176 |
83351 rows × 15 columns